library(dplyr)
library(xtable)
library(summarytools)
library(caret)
library("ggpubr")
require(investr)
source("funs.R")
languages <- c("Arabic", "Basque", "Catalan", "Chinese", "Czech", "English", "Greek", "Hungarian", "Italian", "Turkish")
With a line at 4 sd from the mean to identify outliers.
data <- read_language("Czech")
#setEPS()
#postscript(paste("outliers.eps",sep='_'),width = 7, height = 3.5)
data <- find_outliers(data,"Czech",remove=TRUE)
[1] "outliers removed for Czech : 43"
#dev.off()
We remove them and plot the data again.
plot(data$vertices, data$degree_2nd_moment, xlab = "vertices", ylab = "degree 2nd moment")
We suspect heteroskedasticity. Thus, we aggregate the data grouping on the number of vertices. This works very well for short sentences, but as we can see from the boxplot sentences longer than 60 words circa are more rare, meaning that the smoothing effect of averaging will not be in place, and aggregation will not lead to an accurate approximation of the k second moment.
mean_data <- aggregate(data, list(data$vertices), mean)
#setEPS()
#postscript(paste("long_sentences.eps",sep='_'),width = 7, height = 3.5)
par(mfrow=c(1,2))
plot(mean_data$vertices, mean_data$degree_2nd_moment, xlab = "vertices", ylab = "degree 2nd moment")
abline(v=50, col= 'red')
boxplot(data$vertices)
abline(h=50, col= 'red')
#dev.off()
Run the first model
# MODEL 1
## Obtain initial parameters with linear regression
linear_model_1 = lm(log(degree_2nd_moment)~log(vertices), mean_data)
b_initial = coef(linear_model_1)[2]
## Perform nonlinear regression
nonlinear_model_1 = nls(degree_2nd_moment~(vertices/2)^b, data = mean_data,
start = list(b = b_initial), trace = FALSE)
## Get AIC, RSS and obtained coefficients
s_1 <- round(sqrt(deviance(nonlinear_model_1)/df.residual(nonlinear_model_1)),4)
AIC_1 <- round(AIC(nonlinear_model_1),4)
b_1 <- round(coef(nonlinear_model_1)["b"],4)
We could try to downweight the observations corresponding to sentences longer than 50 words, since they have a strong influence on the model, but they are not as representative as the first ones. We can see a plot of the DFBETAS computed with the Jackknife procedure, confirming the previous statement.
require(nlstools)
Loading required package: nlstools
'nlstools' has been loaded.
IMPORTANT NOTICE: Most nonlinear regression models and data set examples
related to predictive microbiolgy have been moved to the package 'nlsMicrobio'
jack_1 <- nlsJack(nonlinear_model_1)
dfbetas <- jack_1$dfb
threshold <- 2/sqrt(nrow(mean_data))
#setEPS()
#postscript(paste("dfbetas.eps",sep='_'),width = 7, height = 3.5)
plot(mean_data$vertices, dfbetas, xlab = "vertices", ylab = "jackknife DFBETAS")
abline(h=threshold, col= 'blue')
#dev.off()
source("LAB4_script.R")
[1] "Invalid values for Arabic : 0"
[1] "Invalid values for Basque : 0"
[1] "Invalid values for Catalan : 0"
[1] "Invalid values for Chinese : 0"
[1] "Invalid values for Czech : 0"
[1] "Invalid values for English : 0"
[1] "Invalid values for Greek : 0"
[1] "Invalid values for Hungarian : 0"
[1] "Invalid values for Italian : 0"
[1] "Invalid values for Turkish : 0"
languages <- c("Arabic", "Basque", "Catalan", "Chinese", "Czech", "English", "Greek", "Hungarian", "Italian", "Turkish")
langs_4p <- c("Arabic", "Basque", "Catalan","Czech", "English", "Greek", "Hungarian", "Italian")
langs_0 <- c("Chinese","Turkish")
for (l in 1:length(languages)) {
lang <- languages[l]
data <- read_language(lang)
# remove outliers
data <- find_outliers(data,lang,remove = T)
# aggregate
mean_lang = aggregate(data, list(data$vertices), mean)
if (lang %in% langs_4p) {
model <- "4_p"
a_opt <- coefficients_df$`4+: a`[coefficients_df$Language==lang]
d_opt <- coefficients_df$`4+: d`[coefficients_df$Language==lang]
#setEPS()
#postscript(paste(lang,model,"screened.eps",sep='_'))
plot(mean_lang$vertices, mean_lang$degree_2nd_moment, xlab = "vertices", ylab = "degree 2nd moment")
lines(mean_lang$vertices,a_opt*log(mean_lang$vertices) + d_opt, col = "red")
lines(mean_lang$vertices,4-6/mean_lang$vertices, col = "blue")
lines(mean_lang$vertices,mean_lang$vertices-1, col = "blue")
#dev.off()
}
if (lang %in% langs_0) {
model <- "0"
#setEPS()
#postscript(paste(lang,model,"screened.eps",sep='_'))
plot(mean_lang$vertices, mean_lang$degree_2nd_moment, xlab = "vertices", ylab = "degree 2nd moment")
lines(mean_lang$vertices,(1 - 1/mean_lang$vertices)*(5 - 6/mean_lang$vertices), col = "red")
lines(mean_lang$vertices,4-6/mean_lang$vertices, col = "blue")
lines(mean_lang$vertices,mean_lang$vertices-1, col = "blue")
#dev.off()
}
}
[1] "outliers removed for Arabic : 7"
[1] "outliers removed for Basque : 2"
[1] "outliers removed for Catalan : 24"
[1] "outliers removed for Chinese : 11"
[1] "outliers removed for Czech : 43"
[1] "outliers removed for English : 52"
[1] "outliers removed for Greek : 8"
[1] "outliers removed for Hungarian : 25"
[1] "outliers removed for Italian : 7"
[1] "outliers removed for Turkish : 0"
source("funs.R")
robust_half_CI <- get_coeff_CI(robust=TRUE)
[1] "outliers removed for Arabic : 7"
Warning in sqrt(nlsvcovhc(nonlinear_model_3p)) : NaNs produced
[1] "outliers removed for Basque : 2"
Warning in sqrt(nlsvcovhc(nonlinear_model_3p)) : NaNs produced
[1] "outliers removed for Catalan : 24"
Warning in sqrt(nlsvcovhc(nonlinear_model_3p)) : NaNs produced
[1] "outliers removed for Chinese : 11"
Warning in sqrt(nlsvcovhc(nonlinear_model_3p)) : NaNs produced
[1] "outliers removed for Czech : 43"
Warning in sqrt(nlsvcovhc(nonlinear_model_3p)) : NaNs produced
[1] "outliers removed for English : 52"
Warning in sqrt(nlsvcovhc(nonlinear_model_3p)) : NaNs produced
[1] "outliers removed for Greek : 8"
Warning in sqrt(nlsvcovhc(nonlinear_model_3p)) : NaNs produced
[1] "outliers removed for Hungarian : 25"
Warning in sqrt(nlsvcovhc(nonlinear_model_3p)) : NaNs produced
[1] "outliers removed for Italian : 7"
Warning in sqrt(nlsvcovhc(nonlinear_model_3p)) : NaNs produced
[1] "outliers removed for Turkish : 0"
Warning in sqrt(nlsvcovhc(nonlinear_model_3p)) : NaNs produced
half_CI <- get_coeff_CI(robust=FALSE)
[1] "outliers removed for Arabic : 7"
Warning in sqrt(vcov(nonlinear_model_3p)) : NaNs produced
[1] "outliers removed for Basque : 2"
Warning in sqrt(vcov(nonlinear_model_3p)) : NaNs produced
[1] "outliers removed for Catalan : 24"
Warning in sqrt(vcov(nonlinear_model_3p)) : NaNs produced
[1] "outliers removed for Chinese : 11"
Warning in sqrt(vcov(nonlinear_model_3p)) : NaNs produced
[1] "outliers removed for Czech : 43"
Warning in sqrt(vcov(nonlinear_model_3p)) : NaNs produced
[1] "outliers removed for English : 52"
Warning in sqrt(vcov(nonlinear_model_3p)) : NaNs produced
[1] "outliers removed for Greek : 8"
Warning in sqrt(vcov(nonlinear_model_3p)) : NaNs produced
[1] "outliers removed for Hungarian : 25"
Warning in sqrt(vcov(nonlinear_model_3p)) : NaNs produced
[1] "outliers removed for Italian : 7"
Warning in sqrt(vcov(nonlinear_model_3p)) : NaNs produced
[1] "outliers removed for Turkish : 0"
Warning in sqrt(vcov(nonlinear_model_3p)) : NaNs produced
CI_data <- cbind(languages, coefficients_df[,14],robust_half_CI,half_CI)
colnames(CI_data) <- c("language","3+: c","robust_half_CI_c","half_CI_c")
colors <- c("Robust CI" = "Red", "CI" = "Orange")
#setEPS()
#postscript(paste("robust_sd.eps",sep='_'),width = 7, height = 3.5)
ggplot(CI_data) +
geom_bar( aes(x=language, y=`3+: c`), stat="identity", fill="skyblue", alpha=1) +
geom_errorbar( aes(x=language, ymin=`3+: c`- robust_half_CI_c, ymax=`3+: c`+robust_half_CI_c, colour="Robust CI"),
width=0.6, alpha=1, size=1) +
geom_errorbar( aes(x=language, ymin=`3+: c`-half_CI_c, ymax=`3+: c`+half_CI_c, colour="CI"), width=0.6, alpha=1, size=1) +
labs(color = "Legend") +
scale_color_manual(values = colors) +
ylim(-0.020,0.020)
#dev.off()